LOAD LIBRARIES

In [1]:
# Data Manipulation 
import numpy as np
import pandas as pd

# Visualization 
import matplotlib.pyplot as plt
import seaborn as sns
import matplotlib.gridspec as gridspec
import matplotlib.style as style
style.use('fivethirtyeight')

#statistics
from scipy import stats
from scipy.stats import shapiro
from scipy.special import boxcox1p
from scipy.stats import boxcox_normmax
from statsmodels.stats.outliers_influence import variance_inflation_factor

#set maximum columns and rows
pd.set_option('display.max_columns',1000)

# Feature Selection and Encoding
from sklearn.feature_selection import RFE, RFECV
from sklearn.decomposition import PCA
from sklearn.preprocessing import OneHotEncoder, LabelEncoder,StandardScaler
from sklearn.preprocessing import RobustScaler

# Machine learning 
from sklearn import model_selection,preprocessing, metrics, linear_model
from sklearn.model_selection import StratifiedKFold , KFold
from sklearn.neighbors import KNeighborsRegressor
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LinearRegression, LogisticRegression, Ridge, Lasso,RidgeCV,LassoCV,ElasticNetCV
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor,GradientBoostingRegressor,BaggingRegressor,AdaBoostRegressor
from xgboost.sklearn import XGBRegressor
import lightgbm as lgb
from sklearn.svm import SVR
from mlxtend.regressor import StackingCVRegressor
from sklearn.pipeline import make_pipeline


# Grid and Random Search
import scipy.stats as st
from scipy.stats import randint as sp_randint
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV

# Metrics
from sklearn.metrics import precision_recall_fscore_support, roc_curve, auc,r2_score,mean_squared_error

# Managing Warnings 
import warnings
warnings.filterwarnings('ignore')

# Plot the Figures Inline
%matplotlib inline

#plotly
import plotly.graph_objects as go
import plotly.figure_factory as ff
import plotly.express as px
from plotly.subplots import make_subplots

LOAD TRAIN AND TEST DATA

In [2]:
train =pd.read_csv('train.csv')
In [3]:
test=pd.read_csv('test.csv')
In [4]:
def info(df):
    print(f"Dataset Shape: {df.shape}")
    summary = pd.DataFrame(df.dtypes,columns=['dtypes'])
    summary = summary.reset_index()
    summary['Name'] = summary['index']
    summary = summary[['Name','dtypes']]
    summary['Missing'] = df.isnull().sum().values
    summary['Total'] = len(df)   
    summary['Missing Percentage']=np.round(((summary['Missing']/summary['Total'])*100),2)
    summary['Uniques'] = df.nunique().values
    mask=summary[summary['Missing Percentage']>0].sort_values(by=['Missing Percentage'],ascending=False)
    plt.figure(figsize=(7,7))
    sns.barplot(x=mask['Name'],y=mask['Missing Percentage'])
    plt.xticks(rotation=90)
    plt.axhline(50)
    plt.show()
    return(summary) 
In [5]:
info(train)
Dataset Shape: (1460, 81)
Out[5]:
Name dtypes Missing Total Missing Percentage Uniques
0 Id int64 0 1460 0.00 1460
1 MSSubClass int64 0 1460 0.00 15
2 MSZoning object 0 1460 0.00 5
3 LotFrontage float64 259 1460 17.74 110
4 LotArea int64 0 1460 0.00 1073
... ... ... ... ... ... ...
76 MoSold int64 0 1460 0.00 12
77 YrSold int64 0 1460 0.00 5
78 SaleType object 0 1460 0.00 9
79 SaleCondition object 0 1460 0.00 6
80 SalePrice int64 0 1460 0.00 663

81 rows × 6 columns

In [6]:
info(test)
Dataset Shape: (1459, 80)
Out[6]:
Name dtypes Missing Total Missing Percentage Uniques
0 Id int64 0 1459 0.00 1459
1 MSSubClass int64 0 1459 0.00 16
2 MSZoning object 4 1459 0.27 5
3 LotFrontage float64 227 1459 15.56 115
4 LotArea int64 0 1459 0.00 1106
... ... ... ... ... ... ...
75 MiscVal int64 0 1459 0.00 26
76 MoSold int64 0 1459 0.00 12
77 YrSold int64 0 1459 0.00 5
78 SaleType object 1 1459 0.07 9
79 SaleCondition object 0 1459 0.00 6

80 rows × 6 columns

CHECKING WHETHER SALE PRICE(TARGET COLUMN) IS NORMALLY DISTRIBUTED OR NOT

In [7]:
fig = px.histogram(train, x="SalePrice",
                   marginal="box", # or violin, rug
                   hover_data=train)
fig.update_layout(title_text='SalePrice')
fig.show()
In [8]:
fig,ax = plt.subplots(constrained_layout=True,figsize=(10,7))
stats.probplot(train['SalePrice'],plot=ax)
plt.show()

STATISTICAL TEST (SHAPIRO) FOR CHECKING NORMALITY

HYPOTHESIS:

H0 (NULL HYPOTHESIS): DATA IS NORMALLY DISTRIBUTED

HA(ALTERNATE HYPOTHESIS): DATA IS NOT NORMALLY DISTRIBUTED

In [9]:
stat,pvalue=shapiro(train['SalePrice'])
In [10]:
print('p value',pvalue)
p value 3.206247534576162e-33

HERE p-value < alpha(0.05) SO REJECT NULL HYPOTHESIS. HENCE DATA IS NOT NORMALLY DISTRIBUTED.

In [11]:
print('Skewness:',train['SalePrice'].skew())
Skewness: 1.8828757597682129

Here The Skewness value is positive so The Target column (SalePrice) is Right skewed

MEAN > MEDIAN

LOG TRANSFORMATION FOR TARGET COLUMN

In [12]:
train['SalePrice']=np.log1p(train['SalePrice'])
In [13]:
fig,ax = plt.subplots(constrained_layout=True,figsize=(10,7))
stats.probplot(train['SalePrice'],plot=ax)
plt.show()

OUTLIERS

In [14]:
fig = px.histogram(train, x="GrLivArea",
                   marginal="box", # or violin, rug
                   hover_data=train)
fig.update_layout(title_text='GrLivArea')
fig.show()
In [15]:
# Deleting outliers
train = train[train['GrLivArea'] < 4500]
In [16]:
train.reset_index(drop=True, inplace=True)
In [17]:
train.shape
Out[17]:
(1458, 81)

CONCAT TRAIN AND TEST DATA

In [18]:
df=pd.concat([train,test],axis=0)
In [19]:
df.shape
Out[19]:
(2917, 81)
In [20]:
df.drop(['SalePrice','Id'],axis=1,inplace=True)
In [21]:
info(df)
Dataset Shape: (2917, 79)
Out[21]:
Name dtypes Missing Total Missing Percentage Uniques
0 1stFlrSF int64 0 2917 0.00 1081
1 2ndFlrSF int64 0 2917 0.00 633
2 3SsnPorch int64 0 2917 0.00 31
3 Alley object 2719 2917 93.21 2
4 BedroomAbvGr int64 0 2917 0.00 8
... ... ... ... ... ... ...
74 Utilities object 2 2917 0.07 2
75 WoodDeckSF int64 0 2917 0.00 379
76 YearBuilt int64 0 2917 0.00 118
77 YearRemodAdd int64 0 2917 0.00 61
78 YrSold int64 0 2917 0.00 5

79 rows × 6 columns

CONVERT DATATYPES OF FEATURES

In [22]:
df['MSSubClass'] = df['MSSubClass'].apply(str)
df['YrSold'] = df['YrSold'].astype(str)
df['MoSold'] = df['MoSold'].astype(str)

IMPUTATION OF MISSSING VALUES IN CATEGORICAL COLUMNS

In [23]:
df['Functional'] = df['Functional'].fillna('Typ')
df['Electrical'] = df['Electrical'].fillna("SBrkr")
df['KitchenQual'] = df['KitchenQual'].fillna("TA")
df['Exterior1st'] = df['Exterior1st'].fillna(df['Exterior1st'].mode()[0])
df['Exterior2nd'] = df['Exterior2nd'].fillna(df['Exterior2nd'].mode()[0])
df['SaleType'] = df['SaleType'].fillna(df['SaleType'].mode()[0])
In [24]:
for col in ('GarageYrBlt', 'GarageArea', 'GarageCars'):
    df[col] = df[col].fillna(0)
for col in ['GarageType', 'GarageFinish', 'GarageQual', 'GarageCond']:
    df[col] = df[col].fillna('None')
for col in ('BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2'):
    df[col] = df[col].fillna('None')
In [25]:
df['MSZoning'] = df.groupby('MSSubClass')['MSZoning'].transform(lambda x: x.fillna(x.mode()[0]))
In [26]:
objects = []
for i in df.columns:
    if df[i].dtype == object:
        objects.append(i)
In [27]:
df.update(df[objects].fillna('None'))
In [28]:
df['LotFrontage'] = df.groupby('Neighborhood')['LotFrontage'].transform(lambda x: x.fillna(x.median()))

IMPUTATION OF MISSING VALUES IN NUMERICAL COLUMNS

In [29]:
numeric_dtypes = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
numerics = []
for i in df.columns:
    if df[i].dtype in numeric_dtypes:
        numerics.append(i)
df.update(df[numerics].fillna(0))

CHECKING SKEWNESS OF NUMERICAL FEATURES

In [30]:
numeric_dtypes = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
numerics2 = []
for i in df.columns:
    if df[i].dtype in numeric_dtypes:
        numerics2.append(i)
In [31]:
skew_features=df[numerics2].skew()
In [32]:
high_skew = skew_features[skew_features > 0.5]
skew_index = high_skew.index
In [33]:
for i in skew_index:
    df[i] = boxcox1p(df[i], boxcox_normmax(df[i] + 1))

DROPPING FEATURES

In [34]:
df = df.drop(['Utilities', 'Street', 'PoolQC',], axis=1)

CREATING NEW FEATURES

In [35]:
df['YrBltAndRemod']=df['YearBuilt']+df['YearRemodAdd']
df['TotalSF']=df['TotalBsmtSF'] + df['1stFlrSF'] + df['2ndFlrSF']

df['Total_sqr_footage'] = (df['BsmtFinSF1'] + df['BsmtFinSF2'] +
                                 df['1stFlrSF'] + df['2ndFlrSF'])

df['Total_Bathrooms'] = (df['FullBath'] + (0.5 * df['HalfBath']) +
                               df['BsmtFullBath'] + (0.5 * df['BsmtHalfBath']))

df['Total_porch_sf'] = (df['OpenPorchSF'] + df['3SsnPorch'] +
                              df['EnclosedPorch'] + df['ScreenPorch'] +
                              df['WoodDeckSF'])
In [36]:
df['haspool'] = df['PoolArea'].apply(lambda x: 1 if x > 0 else 0)
df['has2ndfloor'] = df['2ndFlrSF'].apply(lambda x: 1 if x > 0 else 0)
df['hasgarage'] = df['GarageArea'].apply(lambda x: 1 if x > 0 else 0)
df['hasbsmt'] = df['TotalBsmtSF'].apply(lambda x: 1 if x > 0 else 0)
df['hasfireplace'] = df['Fireplaces'].apply(lambda x: 1 if x > 0 else 0)
In [37]:
print(df.shape)
(2917, 86)

CREATING GET DUMMIES FOR CATEGORICAL COLUMNS

In [38]:
df=pd.get_dummies(df)
In [39]:
print(df.shape)
(2917, 333)
In [40]:
train.shape
Out[40]:
(1458, 81)

SPLIT TRAIN AND TEST DATAFRAME

In [41]:
Train=df.iloc[0:1458,:]
Test=df.iloc[1458:,:]
In [42]:
Train['SalePrice']=train['SalePrice']

SPLIT DEPENDENT AND INDEPENDENT FEATURES

In [43]:
X=Train.drop('SalePrice',axis=1)
y=Train['SalePrice']
In [44]:
outliers = [30, 88, 462, 631, 1322]
X = X.drop(X.index[outliers])
y = y.drop(y.index[outliers])

FINDING OVERFITING FEATURES

In [45]:
overfit = []
for i in X.columns:
    counts = X[i].value_counts()
    zeros = counts.iloc[0]
    if zeros / len(X) * 100 > 99.94:
        overfit.append(i)

overfit = list(overfit)
overfit.append('MSZoning_C (all)')
In [46]:
overfit
Out[46]:
['MSSubClass_150', 'MSZoning_C (all)']
In [47]:
X = X.drop(overfit, axis=1)
Test = Test.drop(overfit, axis=1)
In [48]:
print('X', X.shape, 'y', y.shape, 'Test', Test.shape)
X (1453, 331) y (1453,) Test (1459, 331)

MODEL BUILDING

In [49]:
#CREATE PIPELINE

alphas_alt = [14.5, 14.6, 14.7, 14.8, 14.9, 15, 15.1, 15.2, 15.3, 15.4, 15.5]
alphas2 = [5e-05, 0.0001, 0.0002, 0.0003, 0.0004, 0.0005, 0.0006, 0.0007, 0.0008]
e_alphas = [0.0001, 0.0002, 0.0003, 0.0004, 0.0005, 0.0006, 0.0007]
e_l1ratio = [0.8, 0.85, 0.9, 0.95, 0.99, 1]

ridge = make_pipeline(RobustScaler(),
                      RidgeCV(alphas=alphas_alt))

lasso = make_pipeline(RobustScaler(),
                      LassoCV(max_iter=1e7, alphas=alphas2,
                              random_state=42))

elasticnet = make_pipeline(RobustScaler(),
                           ElasticNetCV(max_iter=1e7, alphas=e_alphas,
                                         random_state=42, l1_ratio=e_l1ratio))
                                        
svr = make_pipeline(RobustScaler(),
                      SVR(C= 20, epsilon= 0.008, gamma=0.0003,))


gbr = make_pipeline(RobustScaler(),GradientBoostingRegressor(n_estimators=3000, learning_rate=0.05,
                                   max_depth=4, max_features='sqrt',
                                   min_samples_leaf=15, min_samples_split=10, 
                                   loss='huber', random_state =42))
                                   

lightgbm =make_pipeline(RobustScaler(), lgb.LGBMRegressor(objective='regression', 
                                       num_leaves=4,
                                       learning_rate=0.01, 
                                       n_estimators=5000,
                                       max_bin=200, 
                                       bagging_fraction=0.75,
                                       bagging_freq=5, 
                                       bagging_seed=7,
                                       feature_fraction=0.2,
                                       feature_fraction_seed=7,
                                       verbose=-1,
                                       #min_data_in_leaf=2,
                                       #min_sum_hessian_in_leaf=11
                                       ))
                                       

xgboost = make_pipeline(RobustScaler(),XGBRegressor(learning_rate=0.01, n_estimators=3460,
                                     max_depth=3, min_child_weight=0,
                                     gamma=0, subsample=0.7,
                                     colsample_bytree=0.7,
                                     objective='reg:linear', nthread=-1,
                                     scale_pos_weight=1, seed=27,
                                     reg_alpha=0.00006, random_state=42))

# stacking
stack_gen = StackingCVRegressor(regressors=(ridge, lasso, elasticnet,
                                            gbr, xgboost, lightgbm),
                                meta_regressor=xgboost,
                                use_features_in_secondary=True)
In [50]:
pipelines = []
pipelines.append(('ScaledLASSO', lasso))
pipelines.append(('Scaledelasticnet', elasticnet))
pipelines.append(('Scaledsvr', svr))
pipelines.append(('Scaledgbr', gbr))
pipelines.append(('Scaledlightgbm', lightgbm))
#pipelines.append(('Scaledstack_gen', stack_gen))
In [51]:
for name,model in pipelines:
        results=[]
        names=[]
        kfold = model_selection.KFold(n_splits=10, random_state=1,)
        cv_results = model_selection.cross_val_score(model, X, y, cv=kfold,scoring = 'neg_mean_squared_error')
        results.append(np.sqrt(-cv_results))
        names.append(name)
        print(name)
        mse_scores = abs(cv_results)
        rmse_scores = np.sqrt(mse_scores)
        print(rmse_scores)
        print('BIAS ERROR:',rmse_scores.mean())
        print('VARIANCE ERROR:',np.var(rmse_scores,ddof=1))
        results.append(rmse_scores)
        names.append(name)
ScaledLASSO
[0.09426417 0.09356918 0.10024579 0.10965648 0.12358365 0.10271094
 0.11460757 0.0882094  0.09545795 0.10705228]
BIAS ERROR: 0.10293574026300249
VARIANCE ERROR: 0.0001186310650325597
Scaledelasticnet
[0.09403123 0.0935852  0.10046532 0.10965648 0.12358365 0.10266878
 0.11455245 0.08822193 0.0953599  0.10635533]
BIAS ERROR: 0.10284802734596536
VARIANCE ERROR: 0.00011831732561188629
Scaledsvr
[0.08653742 0.08668538 0.09750238 0.11386191 0.12377477 0.10515096
 0.11523996 0.09100452 0.09199886 0.09942922]
BIAS ERROR: 0.10111853828320298
VARIANCE ERROR: 0.00016818785606736859
Scaledgbr
[0.09746151 0.09383563 0.10164315 0.12322454 0.12253975 0.1027489
 0.11782271 0.09370683 0.09806595 0.11065868]
BIAS ERROR: 0.1061707645064445
VARIANCE ERROR: 0.00013288191840493148
Scaledlightgbm
[0.09730699 0.09387751 0.09842104 0.11740984 0.12513174 0.09735992
 0.11887472 0.09535321 0.09869681 0.11486619]
BIAS ERROR: 0.10572979780110092
VARIANCE ERROR: 0.00014012788570129522
In [52]:
from datetime import datetime
print('START Fit')
print(datetime.now(), 'StackingCVRegressor')
stack_gen_model = stack_gen.fit(np.array(X), np.array(y))
print(datetime.now(), 'elasticnet')
elastic_model_full_data = elasticnet.fit(X, y)
print(datetime.now(), 'lasso')
lasso_model_full_data = lasso.fit(X, y)
print(datetime.now(), 'ridge')
ridge_model_full_data = ridge.fit(X, y)
print(datetime.now(), 'svr')
svr_model_full_data = svr.fit(X, y)
print(datetime.now(), 'GradientBoosting')
gbr_model_full_data = gbr.fit(X, y)
print(datetime.now(), 'xgboost')
xgb_model_full_data = xgboost.fit(X, y)
print(datetime.now(), 'lightgbm')
lgb_model_full_data = lightgbm.fit(X, y)
START Fit
2020-01-27 06:57:03.223215 StackingCVRegressor
[06:58:15] WARNING: src/objective/regression_obj.cu:152: reg:linear is now deprecated in favor of reg:squarederror.
[06:58:26] WARNING: src/objective/regression_obj.cu:152: reg:linear is now deprecated in favor of reg:squarederror.
[06:58:37] WARNING: src/objective/regression_obj.cu:152: reg:linear is now deprecated in favor of reg:squarederror.
[06:58:47] WARNING: src/objective/regression_obj.cu:152: reg:linear is now deprecated in favor of reg:squarederror.
[06:58:57] WARNING: src/objective/regression_obj.cu:152: reg:linear is now deprecated in favor of reg:squarederror.
[06:59:18] WARNING: src/objective/regression_obj.cu:152: reg:linear is now deprecated in favor of reg:squarederror.
[06:59:46] WARNING: src/objective/regression_obj.cu:152: reg:linear is now deprecated in favor of reg:squarederror.
2020-01-27 07:00:00.908378 elasticnet
2020-01-27 07:00:04.511584 lasso
2020-01-27 07:00:05.817659 ridge
2020-01-27 07:00:06.053672 svr
2020-01-27 07:00:07.011727 GradientBoosting
2020-01-27 07:00:16.783286 xgboost
[07:00:16] WARNING: src/objective/regression_obj.cu:152: reg:linear is now deprecated in favor of reg:squarederror.
2020-01-27 07:00:29.429009 lightgbm
In [53]:
# FUNCTION TO CALCULATE RMSE
def rmse(y, y_pred):
    return np.sqrt(mean_squared_error(y, y_pred))

BLEND MODELS

In [54]:
def blend_models_predict(X):
    return ((0.1 * elastic_model_full_data.predict(X)) + \
            (0.05 * lasso_model_full_data.predict(X)) + \
            (0.1 * ridge_model_full_data.predict(X)) + \
            (0.1 * svr_model_full_data.predict(X)) + \
            (0.1 * gbr_model_full_data.predict(X)) + \
            (0.15 * xgb_model_full_data.predict(X)) + \
            (0.1 * lgb_model_full_data.predict(X)) + \
            (0.3 * stack_gen_model.predict(np.array(X))))
            
print('RMSE score on train data:')
print(rmse(y, blend_models_predict(X)))
RMSE score on train data:
0.05547466491742177
In [55]:
print('Predict submission')
y_pred=blend_models_predict(Test)
Predict submission
In [56]:
y_pred1=np.expm1(y_pred)
In [57]:
y_pred1
Out[57]:
array([123842.55987694, 159074.31155813, 187080.35884928, ...,
       166622.62556584, 115661.62084274, 214231.09092679])
In [58]:
test['SalePrice']=y_pred1
In [59]:
a=test.loc[:,['Id','SalePrice']]
print(a)
        Id      SalePrice
0     1461  123842.559877
1     1462  159074.311558
2     1463  187080.358849
3     1464  200604.530936
4     1465  188708.256809
...    ...            ...
1454  2915   83860.359064
1455  2916   80797.538577
1456  2917  166622.625566
1457  2918  115661.620843
1458  2919  214231.090927

[1459 rows x 2 columns]
In [60]:
#a.to_csv('Submission17.csv',index=False)